library("tidyverse")
library("ComplexHeatmap")
library("CTexploreR")
library("circlize")
library("DESeq2")
library("SummarizedExperiment")
library("SingleCellExperiment")
library("DropletUtils")
library("scater")
library("scran")
library("scuttle")
legends_param <- list(
labels_gp = gpar(col = "black", fontsize = 3),
title_gp = gpar(col = "black", fontsize = 3, fontface = "bold"),
simple_anno_size = unit(0.1, "cm"),
row_names_gp = gpar(fontsize = 1),
annotation_name_side = "left",
border = FALSE,
border_gp = gpar(lwd = 0.1),
grid_width = unit(0.1, "cm"),
grid_height = unit(0.01, "cm"),
legend_height = unit(0.1, "cm"))
CT_genes_X_met <- all_genes %>%
filter(CT_gene_type == "CT_gene") %>%
filter(X_linked, regulated_by_methylation) %>%
pull(external_gene_name)
Single-Cell RNA-Seq Reveals Lineage and X Chromosome Dynamics in Human Preimplantation Embryos Petropoulos et al, Cell 2016
# Start from raw counts
load(file = "/storage/research/dduv/cbio-lg/cluster/DataSets/early_embryos/Petropoulos_dataset/data/Petropoulos_sce_raw_counts.rda")
sce <- petropoulos_sce_raw_counts
rm(petropoulos_sce_raw_counts)
assayNames(sce) <- "counts"
table(sce$lineage, sce$day)
##
## E3 E4 E5 E6 E7
## epiblast 0 0 41 45 41
## not applicable 81 190 152 0 0
## primitive endoderm 0 0 32 39 37
## trophectoderm 0 0 142 293 388
as_tibble(colData(sce)) %>%
filter(day == "E3") %>%
ggplot(aes(x = individual, y = gender, color = gender)) +
geom_jitter(width = 0.1)+
theme(axis.text.x = element_text(angle = 90))+
facet_wrap(~ day)
# "E3.1", "E3.2", "E3.3" are ambigous
sce <- sce[, !sce$individual %in% c("E3.1", "E3.2", "E3.3")]
as_tibble(colData(sce)) %>%
filter(day == "E4") %>%
ggplot(aes(x = individual, y = gender, color = gender)) +
geom_jitter(width = 0.1)+
theme(axis.text.x = element_text(angle = 90))+
facet_wrap(~ day)
as_tibble(colData(sce)) %>%
filter(day == "E5") %>%
ggplot(aes(x = individual, y = gender, color = gender)) +
geom_jitter(width = 0.1)+
theme(axis.text.x = element_text(angle = 90))+
facet_wrap(~ day)
# "E5.40" is likely a Female
sce$gender[sce$individual == "E5.40"] <- "F"
as_tibble(colData(sce)) %>%
filter(day == "E6") %>%
ggplot(aes(x = individual, y = gender, color = gender)) +
geom_jitter(width = 0.1)+
theme(axis.text.x = element_text(angle = 90))+
facet_wrap(~ day)
# "E6.7" is likely a Male
sce$gender[sce$individual == "E6.7"] <- "M"
as_tibble(colData(sce)) %>%
filter(day == "E7") %>%
ggplot(aes(x = individual, y = gender, color = gender)) +
geom_jitter(width = 0.1)+
theme(axis.text.x = element_text(angle = 90))+
facet_wrap(~ day)
# No mitochondrial genes (not in GTF or in genome ref used for alignment?)
# grep("^MT-", x = rownames(sce), value = TRUE)
sce <- sce[rowSums(assay(sce)) > 0,]
sce$gender <- factor(sce$gender, levels = c("M", "F"))
rowData(sce)$is_mito <- FALSE
rowData(sce)$is_mito[grep(pattern = "^MT-", x = rowData(sce)$gene, value = FALSE)] <- TRUE
sce <- addPerCellQC(sce, subsets=list(Mito = rowData(sce)$is_mito))
# No MT genes
#table(sce$subsets_Mito_percent)
table(sce$day, sce$gender)
##
## M F
## E3 35 28
## E4 98 92
## E5 206 161
## E6 142 235
## E7 176 290
thr_detection <- 7500
thr_detection <- 0
# Define outliers with low detection level
#outliers <- colnames(sce)[isOutlier(sce$detected, type = "lower")]
outliers <- colnames(sce)[sce$detected < thr_detection]
##outliers
sce$outlier_detected <- FALSE
sce$outlier_detected[sce$sample %in% outliers] <- TRUE
f1 <- as_tibble(colData(sce)) %>%
ggplot(aes(x = 1, y = detected)) +
geom_jitter(aes(x = 1, y = detected, color = outlier_detected)) +
geom_violin(alpha = 0.5, outliers = F) +
geom_hline(yintercept = thr_detection, linetype = "dashed") +
theme(legend.position = "bottom",
legend.title.position = "top",
legend.title = element_text(size = 8),
legend.text = element_text(size = 6),
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank()) +
ggtitle("Number of detected genes")
# as_tibble(colData(sce)) %>%
# ggplot(aes(x = 1, y = detected)) +
# geom_jitter(aes(x = 1, y = detected, color = source_name)) +
# geom_violin(alpha = 0.5, outliers = F)
f2 <- as_tibble(colData(sce)) %>%
ggplot(aes(x = 1, y = detected)) +
geom_jitter(aes(x = 1, y = detected, color = day)) +
geom_violin(alpha = 0.5, outliers = F) +
#scale_color_manual(values = c("blue", "cyan3"))+
theme(legend.position = "bottom",
legend.title.position = "top",
legend.title = element_text(size = 8),
legend.text = element_text(size = 6),
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank())+
ggtitle("Number of detected genes")
gridExtra::grid.arrange(f2, f1, ncol = 3)
thr_sum <- 1000000
thr_sum <- 0
#Define outliers with low detection level
#outliers <- colnames(sce)[isOutlier(sce$sum, type = "lower")]
outliers <- colnames(sce)[sce$sum < thr_sum]
sce$outlier_sum <- FALSE
sce$outlier_sum[sce$sample %in% outliers] <- TRUE
f1 <- as_tibble(colData(sce)) %>%
ggplot(aes(x = 1, y = sum)) +
geom_jitter(aes(x = 1, y = sum, color = outlier_sum)) +
geom_violin(alpha = 0.5, outliers = F) +
geom_hline(yintercept = thr_sum, linetype = "dashed") +
theme(legend.position = "bottom",
legend.title.position = "top",
legend.title = element_text(size = 8),
legend.text = element_text(size = 6),
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank()) +
ggtitle("Total number of reads")
f3 <- as_tibble(colData(sce)) %>%
ggplot(aes(x = 1, y = sum)) +
geom_jitter(aes(x = 1, y = sum, color = day)) +
geom_violin(alpha = 0.5, outliers = F) +
#geom_hline(yintercept = thr_sum, linetype = "dashed") +
theme(legend.position = "bottom",
legend.title.position = "top",
legend.title = element_text(size = 8),
legend.text = element_text(size = 6),
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank()) +
ggtitle("Total number of reads")
# as_tibble(colData(sce)) %>%
# ggplot(aes(x = 1, y = detected)) +
# geom_jitter(aes(x = 1, y = detected, color = source_name)) +
# geom_violin(alpha = 0.5, outliers = F)
gridExtra::grid.arrange(f3, f1, ncol = 2)
outliers <- as_tibble(colData(sce)) %>%
filter(outlier_sum | outlier_detected) %>%
pull(sample)
sce$outlier <- FALSE
sce$outlier[sce$sample %in% outliers] <- TRUE
# Define outliers with low sum value
f1 <- as_tibble(colData(sce)) %>%
ggplot(aes(x = 1, y = detected)) +
geom_jitter(aes(x = 1, y = detected, color = outlier)) +
geom_violin(alpha = 0.5, outliers = F) +
theme(legend.position = "bottom",
legend.title.position = "top",
legend.title = element_text(size = 8),
legend.text = element_text(size = 6),
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank())
f2 <- as_tibble(colData(sce)) %>%
ggplot(aes(x = 1, y = sum)) +
geom_jitter(aes(x = 1, y = sum, color = outlier)) +
geom_violin(alpha = 0.5, outliers = F) +
theme(legend.position = "bottom",
legend.title.position = "top",
legend.title = element_text(size = 8),
legend.text = element_text(size = 6),
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank())
gridExtra::grid.arrange(f1, f2, ncol = 2)
=> Removed outlier cells
table(outlier =sce$outlier)
## outlier
## FALSE
## 1463
sce <- sce[, !sce$outlier]
=> 1463 cells kept
table(sce$day, sce$gender)
##
## M F
## E3 35 28
## E4 98 92
## E5 206 161
## E6 142 235
## E7 176 290
Checked that the data’s quality is independant of sex factor:
a <- as_tibble(colData(sce)) %>%
ggplot(aes(x = gender, y = detected)) +
geom_jitter(aes(x = gender, y = detected, color = gender)) +
geom_boxplot(alpha = 0.5, outliers = F) +
scale_color_manual(values = list("M" = "steelblue", "F" = "deeppink")) +
theme(legend.position = "bottom",
legend.title.position = "top",
legend.title = element_text(size = 8),
legend.text = element_text(size = 6),
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank())+
ggtitle("Number of detected genes")
b <- as_tibble(colData(sce)) %>%
ggplot(aes(x = gender, y = sum)) +
geom_jitter(aes(x = gender, y = sum, color = gender)) +
geom_boxplot(alpha = 0.5, outliers = F) +
scale_color_manual(values = list("M" = "steelblue", "F" = "deeppink")) +
theme(legend.position = "bottom",
legend.title.position = "top",
legend.title = element_text(size = 8),
legend.text = element_text(size = 6),
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank())+
ggtitle("Sequencing Depth")
gridExtra::grid.arrange(a,b, ncol = 2)
## Normalisation
sce <- logNormCounts(sce, transform = "none")
sce <- logNormCounts(sce)
## Feature selection
hvg <- modelGeneVar(sce)
fit_hvg <- metadata(hvg)
# plot(fit_hvg$mean, fit_hvg$var,
# xlab = "Mean of log-expression",
# ylab = "Variance of log-expression",
# col = ifelse(hvg$p.value < 0.1, "red", "black"))
# curve(fit_hvg$trend(x), col = "dodgerblue", add = TRUE, lwd = 2)
topHvg <- getTopHVGs(hvg, n = 500)
rowSubset(sce) <- topHvg
## PCA
set.seed(123)
sce <- fixedPCA(sce, rank = 100, subset.row = topHvg) # rank = 50 by default
plotPCA(sce, colour_by = "day", shape = "lineage")
plotPCA(sce, colour_by = "gender")
plotPCA(sce, colour_by = "lineage")
idem but focus on E4
n_cells_per_individual <- enframe(table(sce$individual), name = "individual", value = "n_cells")
sce_reduced <- sce[, sce$day == "E4"]
hvg <- modelGeneVar(sce_reduced)
fit_hvg <- metadata(hvg)
# plot(fit_hvg$mean, fit_hvg$var,
# xlab = "Mean of log-expression",
# ylab = "Variance of log-expression",
# col = ifelse(hvg$p.value < 0.1, "red", "black"))
# curve(fit_hvg$trend(x), col = "dodgerblue", add = TRUE, lwd = 2)
topHvg <- getTopHVGs(hvg, n = 500)
rowSubset(sce_reduced) <- topHvg
# plotPCA(sce_reduced, colour_by = "day", shape = "lineage")
# plotPCA(sce_reduced, colour_by = "gender")
selected <- n_cells_per_individual %>% filter(n_cells> 5) %>% pull(individual)
plotPCA(sce_reduced[, sce_reduced$individual %in% selected], colour_by = "individual")
idem but focus on E5
n_cells_per_individual <- enframe(table(sce$individual), name = "individual", value = "n_cells")
sce_reduced <- sce[, sce$day == "E5"]
hvg <- modelGeneVar(sce_reduced)
fit_hvg <- metadata(hvg)
# plot(fit_hvg$mean, fit_hvg$var,
# xlab = "Mean of log-expression",
# ylab = "Variance of log-expression",
# col = ifelse(hvg$p.value < 0.1, "red", "black"))
# curve(fit_hvg$trend(x), col = "dodgerblue", add = TRUE, lwd = 2)
topHvg <- getTopHVGs(hvg, n = 500)
rowSubset(sce_reduced) <- topHvg
#
# plotPCA(sce_reduced, colour_by = "day", shape = "lineage")
# plotPCA(sce_reduced, colour_by = "gender")
selected <- n_cells_per_individual %>% filter(n_cells> 10) %>% pull(individual)
plotPCA(sce_reduced[, sce_reduced$individual %in% selected], colour_by = "individual")
plotPCA(sce_reduced[, sce_reduced$individual %in% selected], colour_by = "gender")
plotPCA(sce_reduced[, sce_reduced$individual %in% selected], colour_by = "sum")
plotPCA(sce_reduced[, sce_reduced$individual %in% selected], colour_by = "detected")
idem but focus on E6
n_cells_per_individual <- enframe(table(sce$individual), name = "individual", value = "n_cells")
sce_reduced <- sce[, sce$day == "E6"]
hvg <- modelGeneVar(sce_reduced)
fit_hvg <- metadata(hvg)
# plot(fit_hvg$mean, fit_hvg$var,
# xlab = "Mean of log-expression",
# ylab = "Variance of log-expression",
# col = ifelse(hvg$p.value < 0.1, "red", "black"))
# curve(fit_hvg$trend(x), col = "dodgerblue", add = TRUE, lwd = 2)
topHvg <- getTopHVGs(hvg, n = 500)
rowSubset(sce_reduced) <- topHvg
# plotPCA(sce_reduced, colour_by = "day", shape = "lineage")
# plotPCA(sce_reduced, colour_by = "gender")
selected <- n_cells_per_individual %>% filter(n_cells> 10) %>% pull(individual)
plotPCA(sce_reduced[, sce_reduced$individual %in% selected], colour_by = "individual")
plotPCA(sce_reduced[, sce_reduced$individual %in% selected], colour_by = "gender")
plotPCA(sce_reduced[, sce_reduced$individual %in% selected], colour_by = "lineage")
plotPCA(sce_reduced[, sce_reduced$individual %in% selected], colour_by = "sum")
plotPCA(sce_reduced[, sce_reduced$individual %in% selected], colour_by = "detected")
idem but focus on E7
n_cells_per_individual <- enframe(table(sce$individual), name = "individual", value = "n_cells")
sce_reduced <- sce[, sce$day == "E7"]
hvg <- modelGeneVar(sce_reduced)
fit_hvg <- metadata(hvg)
# plot(fit_hvg$mean, fit_hvg$var,
# xlab = "Mean of log-expression",
# ylab = "Variance of log-expression",
# col = ifelse(hvg$p.value < 0.1, "red", "black"))
# curve(fit_hvg$trend(x), col = "dodgerblue", add = TRUE, lwd = 2)
topHvg <- getTopHVGs(hvg, n = 500)
rowSubset(sce_reduced) <- topHvg
#plotPCA(sce_reduced, colour_by = "day", shape = "lineage")
#plotPCA(sce_reduced, colour_by = "gender")
selected <- n_cells_per_individual %>% filter(n_cells> 10) %>% pull(individual)
plotPCA(sce_reduced[, sce_reduced$individual %in% selected], colour_by = "individual")
plotPCA(sce_reduced[, sce_reduced$individual %in% selected], colour_by = "gender")
plotPCA(sce_reduced[, sce_reduced$individual %in% selected], colour_by = "lineage")
plotPCA(sce_reduced[, sce_reduced$individual %in% selected], colour_by = "sum")
plotPCA(sce_reduced[, sce_reduced$individual %in% selected], colour_by = "detected")
table(sce$lineage, sce$day, sce$gender)
## , , = M
##
##
## E3 E4 E5 E6 E7
## epiblast 0 0 17 4 19
## not applicable 35 98 99 0 0
## primitive endoderm 0 0 18 14 23
## trophectoderm 0 0 72 124 134
##
## , , = F
##
##
## E3 E4 E5 E6 E7
## epiblast 0 0 24 41 22
## not applicable 28 92 53 0 0
## primitive endoderm 0 0 14 25 14
## trophectoderm 0 0 70 169 254
legends_param <- list(
labels_gp = gpar(col = "black", fontsize = 10),
title_gp = gpar(col = "black", fontsize = 10, fontface = "bold"),
simple_anno_size = unit(0.1, "cm"),
row_names_gp = gpar(fontsize = 8),
annotation_name_side = "left",
border = FALSE,
border_gp = gpar(lwd = 0.1),
grid_width = unit(0.3, "cm"),
grid_height = unit(0.3, "cm"),
legend_height = unit(0.3, "cm"))
legend_colors <- c(
"#5E4FA2", "#3288BD", "#66C2A5", "#ABDDA4", "#E6F598",
"#FFFFBF", "#FEE08B", "#FDAE61", "#F46D43", "#D53E4F",
"#9E0142")
ensembl_gene <- data.frame(gene = all_genes$external_gene_name)
rownames(ensembl_gene) <- all_genes$ensembl_gene_id
draw_embryo_exp_Petropoulos_dataset <- function(genes,
detected_in_percent_of_cells = 0,
day = NULL,
font_size = 6,
gender = NULL,
scale = FALSE,
assayName = "normcounts",
scale_lims = NULL,
clustRow = TRUE,
h_width = 8,
h_height = 8,
show_right_annot = TRUE,
clust_method = "centroid"){
# Top Annotations
df_col <- colData(sce)
df_col <- df_col[!is.na(df_col$gender),]
if (!is.null(day)) df_col <- df_col[df_col$day %in% day, ]
df_col$sex <- df_col$gender
levels(df_col$sex) <- c("Male", "Female")
df_col <- df_col[order(df_col$day, df_col$sex, df_col$individual, df_col$lineage),]
df_col$group <- paste0(df_col$day, "_", df_col$gender)
column_ha_sex = HeatmapAnnotation(
sex = df_col$sex,
col = list(sex = c("Female" = "deeppink", "Male" = "steelblue")),
border = FALSE,
simple_anno_size = unit(0.4, "cm"),
annotation_legend_param = legends_param,
annotation_name_gp = gpar(fontsize = 0))
column_ha_individual = HeatmapAnnotation(
individual = df_col$individual,
#col = list(individual = c("Female" = "deeppink", "Male" = "steelblue")),
border = FALSE,
simple_anno_size = unit(0.4, "cm"),
annotation_legend_param = legends_param,
annotation_name_gp = gpar(fontsize = 0))
column_ha_lineage = HeatmapAnnotation(
lineage = df_col$lineage,
#col = list(individual = c("Female" = "deeppink", "Male" = "steelblue")),
border = FALSE,
simple_anno_size = unit(0.4, "cm"),
annotation_legend_param = legends_param,
annotation_name_gp = gpar(fontsize = 0))
column_ha_day = HeatmapAnnotation(
day = df_col$day,
#col = list(Stage = c("ICM" = "gray", "TE" = "gray10")),
border = FALSE,
simple_anno_size = unit(0.4, "cm"),
annotation_legend_param = legends_param,
annotation_name_gp = gpar(fontsize = 0))
column_ha_sum = HeatmapAnnotation(
sum = df_col$sum,
border = FALSE,
simple_anno_size = unit(0.4, "cm"),
annotation_legend_param = legends_param,
annotation_name_gp = gpar(fontsize = 0))
column_ha_detected = HeatmapAnnotation(
detected = df_col$detected,
border = FALSE,
simple_anno_size = unit(0.4, "cm"),
annotation_legend_param = legends_param,
annotation_name_gp = gpar(fontsize = 0))
n <- which(assayNames(sce) == assayName)
mat <- assay(sce, i = n)[genes, ]
# Show only genes detected in more than x% of cells
mat <- mat[rowSums(mat > 0) >= dim(mat)[2]*detected_in_percent_of_cells,]
# gene_type <- CT_genes_types %>%
# filter(gene %in% genes) %>%
# mutate(gene = factor(gene, levels = genes)) %>%
# arrange(gene)
if (scale) {
mat <- t(scale(t(mat), center = FALSE, scale = TRUE))
assayName = "scaled_exp"
}
if (is.null(scale_lims)) scale_lims <- c(0, quantile(mat, .9))
heatmap_colors <- colorRamp2(
seq(scale_lims[1], scale_lims[2], length = 11), legend_colors)
lines <- tibble(gene = rownames(mat)) %>%
mutate(group = case_when(gene %in% CT_genes$external_gene_name ~ "X-linked CG MethDep",
!gene %in% CT_genes$external_gene_name ~ "Control")) %>%
mutate(group = factor(group, levels = c("X-linked CG MethDep", "Control")))
ht1 <- Heatmap(mat[, rownames(df_col)],
name = assayName,
clustering_method_rows = "ward.D2",
clustering_method_columns = "ward.D2",
column_split = df_col$group,
row_split = lines$group,
row_gap = unit(c(1), "mm"),
col = heatmap_colors,
cluster_rows = clustRow,
cluster_columns = FALSE,
cluster_row_slices = FALSE,
cluster_column_slices = FALSE,
show_row_names = TRUE,
show_column_names = FALSE,
show_row_dend = FALSE,
column_names_gp = gpar(fontsize = 8),
column_names_centered = TRUE,
row_names_gp = gpar(fontsize = font_size),
row_names_side = "left",
border = TRUE,
border_gp = gpar(lwd = 0.5),
column_names_side = c("bottom"),
column_names_rot = 90,
column_title_gp = gpar(fontsize = 8),
row_title_gp = gpar(fontsize = 0),
top_annotation = c(column_ha_day,
column_ha_sex,
column_ha_detected,
column_ha_lineage,
column_ha_sum,
column_ha_individual,
# column_ha_sample,
gap = unit(0.1, "mm")),
use_raster = TRUE,
raster_device = "CairoPNG",
raster_quality = 10,
heatmap_legend_param = legends_param,
width = unit(h_width, "cm"),
height = unit(h_height, "cm"))
draw(ht1, merge_legend = TRUE)
}
my_genes <- all_genes %>%
filter(CT_gene_type == "CT_gene") %>%
filter(X_linked, regulated_by_methylation) %>%
filter(external_gene_name %in% rownames(sce))
controls <- all_genes %>%
filter(external_gene_name %in% c(#"GAPDH", "ACTB", "POU5F1", "XIST",
"DDX3Y", "RPS4Y1", "EIF1AY"))
X_linked <- all_genes %>%
filter(chr == "X") %>%
filter(external_gene_name %in% rownames(sce)) %>%
filter(!external_gene_name %in% CT_genes$external_gene_name) %>%
pull(external_gene_name)
chr1_linked <- all_genes %>%
filter(chr == 1) %>%
filter(external_gene_name %in% rownames(sce)) %>%
filter(!external_gene_name %in% CT_genes$external_gene_name) %>%
pull(external_gene_name)
draw_embryo_exp_Petropoulos_dataset <- function(genes,
detected_in_percent_of_cells = 0,
day = NULL,
font_size = 6,
gender = NULL,
scale = FALSE,
assayName = "normcounts",
scale_lims = NULL,
clustRow = TRUE,
h_width = 8,
h_height = 8,
show_right_annot = TRUE,
clust_method = "centroid"){
# Top Annotations
df_col <- colData(sce)
df_col <- df_col[!is.na(df_col$gender),]
if (!is.null(day)) df_col <- df_col[df_col$day %in% day, ]
df_col$sex <- df_col$gender
levels(df_col$sex) <- c("Male", "Female")
df_col <- df_col[order(df_col$day, df_col$sex, df_col$individual, df_col$lineage),]
df_col$group <- paste0(df_col$day, "_", df_col$gender)
column_ha_sex = HeatmapAnnotation(
sex = df_col$sex,
col = list(sex = c("Female" = "deeppink", "Male" = "steelblue")),
border = FALSE,
simple_anno_size = unit(0.4, "cm"),
annotation_legend_param = legends_param,
annotation_name_gp = gpar(fontsize = 0))
column_ha_individual = HeatmapAnnotation(
individual = df_col$individual,
#col = list(individual = c("Female" = "deeppink", "Male" = "steelblue")),
border = FALSE,
simple_anno_size = unit(0.4, "cm"),
annotation_legend_param = legends_param,
annotation_name_gp = gpar(fontsize = 0))
column_ha_lineage = HeatmapAnnotation(
lineage = df_col$lineage,
#col = list(individual = c("Female" = "deeppink", "Male" = "steelblue")),
border = FALSE,
simple_anno_size = unit(0.4, "cm"),
annotation_legend_param = legends_param,
annotation_name_gp = gpar(fontsize = 0))
column_ha_day = HeatmapAnnotation(
day = df_col$day,
#col = list(Stage = c("ICM" = "gray", "TE" = "gray10")),
border = FALSE,
simple_anno_size = unit(0.4, "cm"),
annotation_legend_param = legends_param,
annotation_name_gp = gpar(fontsize = 0))
column_ha_sum = HeatmapAnnotation(
sum = df_col$sum,
border = FALSE,
simple_anno_size = unit(0.4, "cm"),
annotation_legend_param = legends_param,
annotation_name_gp = gpar(fontsize = 0))
column_ha_detected = HeatmapAnnotation(
detected = df_col$detected,
border = FALSE,
simple_anno_size = unit(0.4, "cm"),
annotation_legend_param = legends_param,
annotation_name_gp = gpar(fontsize = 0))
n <- which(assayNames(sce) == assayName)
mat <- assay(sce, i = n)[genes, ]
# Show only genes detected in more than x% of cells
mat <- mat[rowSums(mat > 0) >= dim(mat)[2]*detected_in_percent_of_cells,]
# gene_type <- CT_genes_types %>%
# filter(gene %in% genes) %>%
# mutate(gene = factor(gene, levels = genes)) %>%
# arrange(gene)
if (scale) {
mat <- t(scale(t(mat), center = FALSE, scale = TRUE))
assayName = "scaled_exp"
}
if (is.null(scale_lims)) scale_lims <- c(0, quantile(mat, .9))
heatmap_colors <- colorRamp2(
seq(scale_lims[1], scale_lims[2], length = 11), legend_colors)
lines <- tibble(gene = rownames(mat)) %>%
mutate(group = case_when(gene %in% CT_genes$external_gene_name ~ "X-linked CG MethDep",
!gene %in% CT_genes$external_gene_name ~ "Control")) %>%
mutate(group = factor(group, levels = c("X-linked CG MethDep", "Control")))
ht1 <- Heatmap(mat[, rownames(df_col)],
name = assayName,
clustering_method_rows = "ward.D2",
clustering_method_columns = "ward.D2",
column_split = df_col$group,
row_split = lines$group,
row_gap = unit(c(1), "mm"),
col = heatmap_colors,
cluster_rows = clustRow,
cluster_columns = FALSE,
cluster_row_slices = FALSE,
cluster_column_slices = FALSE,
show_row_names = TRUE,
show_column_names = FALSE,
show_row_dend = FALSE,
column_names_gp = gpar(fontsize = 8),
column_names_centered = TRUE,
row_names_gp = gpar(fontsize = font_size),
row_names_side = "left",
border = TRUE,
border_gp = gpar(lwd = 0.5),
column_names_side = c("bottom"),
column_names_rot = 90,
column_title_gp = gpar(fontsize = 8),
row_title_gp = gpar(fontsize = 0),
top_annotation = c(column_ha_day,
column_ha_sex,
# column_ha_sample,
gap = unit(0.1, "mm")),
use_raster = TRUE,
raster_device = "CairoPNG",
raster_quality = 10,
heatmap_legend_param = legends_param,
width = unit(h_width, "cm"),
height = unit(h_height, "cm"))
draw(ht1, merge_legend = TRUE)
}
draw_embryo_exp_Petropoulos_dataset(c("MAGEC2", "MAGEB16", "VCX", "VCX2", "VCX3B", "DDX53", #"PAGE2", #"PAGE1",
controls$external_gene_name, #"ACTB", "GAPDH",
"RPS26", "XIST"),
#day = "E6",
clustRow = T,
scale_lims = c(0, 300),
detected_in_percent_of_cells = 0)
draw_embryo_exp_Petropoulos_dataset(c(my_genes$external_gene_name,
controls$external_gene_name, #"ACTB", "GAPDH",
"RPS26", "XIST"),
#day = "E3",
clustRow = T,
scale_lims = c(0, 300),
detected_in_percent_of_cells = 0)
draw_embryo_exp_Petropoulos_dataset <- function(genes,
detected_in_percent_of_cells = 0,
day = NULL,
font_size = 6,
gender = NULL,
scale = FALSE,
assayName = "normcounts",
scale_lims = NULL,
clustRow = TRUE,
h_width = 8,
h_height = 8,
show_right_annot = TRUE,
clust_method = "centroid"){
# Top Annotations
df_col <- colData(sce)
df_col <- df_col[!is.na(df_col$gender),]
if (!is.null(day)) df_col <- df_col[df_col$day %in% day, ]
df_col$sex <- df_col$gender
levels(df_col$sex) <- c("Male", "Female")
df_col <- df_col[order(df_col$day, df_col$sex, df_col$individual, df_col$lineage),]
df_col$group <- paste0(df_col$day, "_", df_col$gender)
column_ha_sex = HeatmapAnnotation(
sex = df_col$sex,
col = list(sex = c("Female" = "deeppink", "Male" = "steelblue")),
border = FALSE,
simple_anno_size = unit(0.4, "cm"),
annotation_legend_param = legends_param,
annotation_name_gp = gpar(fontsize = 0))
column_ha_individual = HeatmapAnnotation(
individual = df_col$individual,
#col = list(individual = c("Female" = "deeppink", "Male" = "steelblue")),
border = FALSE,
simple_anno_size = unit(0.4, "cm"),
annotation_legend_param = legends_param,
annotation_name_gp = gpar(fontsize = 0))
column_ha_lineage = HeatmapAnnotation(
lineage = df_col$lineage,
#col = list(individual = c("Female" = "deeppink", "Male" = "steelblue")),
border = FALSE,
simple_anno_size = unit(0.4, "cm"),
annotation_legend_param = legends_param,
annotation_name_gp = gpar(fontsize = 0))
column_ha_day = HeatmapAnnotation(
day = df_col$day,
#col = list(Stage = c("ICM" = "gray", "TE" = "gray10")),
border = FALSE,
simple_anno_size = unit(0.4, "cm"),
annotation_legend_param = legends_param,
annotation_name_gp = gpar(fontsize = 0))
column_ha_sum = HeatmapAnnotation(
sum = df_col$sum,
border = FALSE,
simple_anno_size = unit(0.4, "cm"),
annotation_legend_param = legends_param,
annotation_name_gp = gpar(fontsize = 0))
column_ha_detected = HeatmapAnnotation(
detected = df_col$detected,
border = FALSE,
simple_anno_size = unit(0.4, "cm"),
annotation_legend_param = legends_param,
annotation_name_gp = gpar(fontsize = 0))
n <- which(assayNames(sce) == assayName)
mat <- assay(sce, i = n)[genes, ]
# Show only genes detected in more than x% of cells
mat <- mat[rowSums(mat > 0) >= dim(mat)[2]*detected_in_percent_of_cells,]
# gene_type <- CT_genes_types %>%
# filter(gene %in% genes) %>%
# mutate(gene = factor(gene, levels = genes)) %>%
# arrange(gene)
if (scale) {
mat <- t(scale(t(mat), center = FALSE, scale = TRUE))
assayName = "scaled_exp"
}
if (is.null(scale_lims)) scale_lims <- c(0, quantile(mat, .9))
heatmap_colors <- colorRamp2(
seq(scale_lims[1], scale_lims[2], length = 11), legend_colors)
lines <- tibble(gene = rownames(mat)) %>%
mutate(group = case_when(gene %in% CT_genes$external_gene_name ~ "X-linked CG MethDep",
!gene %in% CT_genes$external_gene_name ~ "Control")) %>%
mutate(group = factor(group, levels = c("X-linked CG MethDep", "Control")))
ht1 <- Heatmap(mat[, rownames(df_col)],
name = assayName,
clustering_method_rows = "ward.D2",
clustering_method_columns = "ward.D2",
column_split = df_col$group,
row_split = lines$group,
row_gap = unit(c(1), "mm"),
col = heatmap_colors,
cluster_rows = clustRow,
cluster_columns = FALSE,
cluster_row_slices = FALSE,
cluster_column_slices = FALSE,
show_row_names = TRUE,
show_column_names = FALSE,
show_row_dend = FALSE,
column_names_gp = gpar(fontsize = 8),
column_names_centered = TRUE,
row_names_gp = gpar(fontsize = font_size),
row_names_side = "left",
border = TRUE,
border_gp = gpar(lwd = 0.5),
column_names_side = c("bottom"),
column_names_rot = 90,
column_title_gp = gpar(fontsize = 8),
row_title_gp = gpar(fontsize = 0),
top_annotation = c(column_ha_day,
column_ha_sex,
column_ha_detected,
column_ha_lineage,
column_ha_sum,
column_ha_individual,
# column_ha_sample,
gap = unit(0.1, "mm")),
use_raster = TRUE,
raster_device = "CairoPNG",
raster_quality = 10,
heatmap_legend_param = legends_param,
width = unit(h_width, "cm"),
height = unit(h_height, "cm"))
draw(ht1, merge_legend = TRUE)
}
Focus on selected CG genes :
draw_embryo_exp_Petropoulos_dataset(c("MAGEC2", "MAGEB16", "VCX", "VCX2", "VCX3B", "DDX53", #"PAGE2", #"PAGE1",
controls$external_gene_name, "ACTB", "GAPDH", "RPS26", "XIST"),
day = "E3",
clustRow = T,
scale_lims = c(0, 300),
detected_in_percent_of_cells = 0)
All X-linked MethDep CG genes :
draw_embryo_exp_Petropoulos_dataset(c(my_genes$external_gene_name, controls$external_gene_name, "RPS26"),
day = "E3",
clustRow = T,
scale_lims = c(0, 300),
detected_in_percent_of_cells = 0)
All X-linked other than CG genes:
draw_embryo_exp_Petropoulos_dataset(c(X_linked),
day = "E3",
clustRow = T,
#scale_lims = c(0, 500),
scale = T,
font_size = 6,
detected_in_percent_of_cells = 0.5)
chr1-linked genes:
draw_embryo_exp_Petropoulos_dataset(c(chr1_linked),
day = "E3",
clustRow = T,
#scale_lims = c(0, 1),
scale = T,
font_size = 6,
detected_in_percent_of_cells = 0.8)
Some Female cells (usually corresponding to the same embryo) express no CG genes,
but they also express some control genes at lower expression level
bad quality embryos ? (unlikely as no link with number of detected genes or sequencing depth)
expression embryo-specific?
linked with ZGA that has not occured yet?
Focus on selected CG genes :
draw_embryo_exp_Petropoulos_dataset(c("MAGEC2", "MAGEB16", "VCX", "VCX2", "VCX3B", "DDX53", #"PAGE2", #"PAGE1",
controls$external_gene_name, "ACTB", "GAPDH", "RPS26", "XIST"),
day = "E4",
clustRow = T,
scale_lims = c(0, 300),
detected_in_percent_of_cells = 0)
All X-linked MethDep CG genes :
draw_embryo_exp_Petropoulos_dataset(c(my_genes$external_gene_name, controls$external_gene_name, "RPS26"),
day = "E4",
clustRow = T,
scale_lims = c(0, 300),
detected_in_percent_of_cells = 0)
All X-linked other than CG genes:
draw_embryo_exp_Petropoulos_dataset(c(X_linked),
day = "E4",
clustRow = T,
#scale_lims = c(0, 500),
scale = T,
font_size = 6,
detected_in_percent_of_cells = 0.5)
chr1-linked genes:
draw_embryo_exp_Petropoulos_dataset(c(chr1_linked),
day = "E4",
clustRow = T,
#scale_lims = c(0, 1),
scale = T,
font_size = 6,
detected_in_percent_of_cells = 0.8)
Focus on selected CG genes :
draw_embryo_exp_Petropoulos_dataset(c("MAGEC2", "MAGEB16", "VCX", "VCX2", "VCX3B", "DDX53", #"PAGE2", #"PAGE1",
controls$external_gene_name, "ACTB", "GAPDH", "RPS26", "XIST"),
day = "E5",
clustRow = T,
scale_lims = c(0, 300),
detected_in_percent_of_cells = 0)
All X-linked MethDep CG genes :
draw_embryo_exp_Petropoulos_dataset(c(my_genes$external_gene_name, controls$external_gene_name, "RPS26"),
day = "E5",
clustRow = T,
scale_lims = c(0, 300),
detected_in_percent_of_cells = 0)
All X-linked other than CG genes:
draw_embryo_exp_Petropoulos_dataset(c(X_linked),
day = "E5",
clustRow = T,
#scale_lims = c(0, 500),
scale = T,
font_size = 6,
detected_in_percent_of_cells = 0.5)
chr1-linked genes:
draw_embryo_exp_Petropoulos_dataset(c(chr1_linked),
day = "E5",
clustRow = T,
#scale_lims = c(0, 1),
scale = T,
font_size = 6,
detected_in_percent_of_cells = 0.8)
Focus on selected CG genes :
draw_embryo_exp_Petropoulos_dataset(c("MAGEC2", "MAGEB16", "VCX", "VCX2", "VCX3B", "DDX53", #"PAGE2", #"PAGE1",
controls$external_gene_name, "ACTB", "GAPDH", "RPS26", "XIST"),
day = "E6",
clustRow = T,
scale_lims = c(0, 300),
detected_in_percent_of_cells = 0)
All X-linked MethDep CG genes :
draw_embryo_exp_Petropoulos_dataset(c(my_genes$external_gene_name, controls$external_gene_name, "RPS26"),
day = "E6",
clustRow = T,
scale_lims = c(0, 300),
detected_in_percent_of_cells = 0)
All X-linked other than CG genes:
draw_embryo_exp_Petropoulos_dataset(c(X_linked),
day = "E6",
clustRow = T,
#scale_lims = c(0, 500),
scale = T,
font_size = 6,
detected_in_percent_of_cells = 0.5)
chr1-linked genes:
draw_embryo_exp_Petropoulos_dataset(c(chr1_linked),
day = "E6",
clustRow = T,
#scale_lims = c(0, 1),
scale = T,
font_size = 6,
detected_in_percent_of_cells = 0.8)
Focus on selected CG genes :
draw_embryo_exp_Petropoulos_dataset(c("MAGEC2", "MAGEB16", "VCX", "VCX2", "VCX3B", "DDX53", #"PAGE2", #"PAGE1",
controls$external_gene_name, "ACTB", "GAPDH", "RPS26", "XIST"),
day = "E7",
clustRow = T,
scale_lims = c(0, 300),
detected_in_percent_of_cells = 0)
All X-linked MethDep CG genes :
draw_embryo_exp_Petropoulos_dataset(c(my_genes$external_gene_name, controls$external_gene_name, "RPS26"),
day = "E7",
clustRow = T,
scale_lims = c(0, 300),
detected_in_percent_of_cells = 0)
All X-linked other than CG genes:
draw_embryo_exp_Petropoulos_dataset(c(X_linked),
day = "E7",
clustRow = T,
#scale_lims = c(0, 500),
scale = T,
font_size = 6,
detected_in_percent_of_cells = 0.5)
chr1-linked genes:
draw_embryo_exp_Petropoulos_dataset(c(chr1_linked),
day = "E7",
clustRow = T,
#scale_lims = c(0, 1),
scale = T,
font_size = 6,
detected_in_percent_of_cells = 0.8)
sce$ind_lineage <- paste0(sce$individual, '_', sce$lineage)
dds <- scuttle::summarizeAssayByGroup(sce,
ids = colData(sce)[, c("ind_lineage")],
statistics = "sum")
coldata <- tibble(ind_lineage = dds$ids,
ncells = dds$ncells) %>%
left_join(as_tibble(colData(sce)) %>%
dplyr::select(ind_lineage, day, stage, gender, lineage) %>% unique()) %>%
as.data.frame()
# remove E3
coldata <- coldata[coldata$day %in% c("E4", "E5", "E6", "E7"),]
#table(coldata$gender, coldata$ind_lineage)
# Remove embryo with ambigous gender
#coldata <- coldata[!coldata$ambigous, ]
rownames(coldata) <- coldata$ind_lineage
# table(ncells = coldata$ncells, day = coldata$day)
# table(ncells = coldata$ncells, day = coldata$day, coldata$gender)
number_cells_thr <- 3
n <- length(unique(dds$ids))
as_tibble(colData(dds)) %>%
mutate(day = substr(start = 1, stop = 2, ids),
embryo = gsub("_.*", x = ids, replace = '' )) %>%
ggplot(aes( x = day, y = ncells, color = day)) +
geom_jitter() +
# scale_color_manual(values = sample(scales::hue_pal()(n)))+
theme(legend.position = "none") +
geom_hline(yintercept = number_cells_thr, linetype = "dashed")
Remove Individuals with less than 3 cells
Removed day3 (ZGA not fully completed yet)
design = gender + lineage + day
# Remove Individual with less than x cells
coldata <- coldata[coldata$ncells >= number_cells_thr,]
#table(ncells = coldata$ncells, day = coldata$day, coldata$gender, coldata$lineage)
dds <- dds[, coldata$ind_lineage]
dds$day <- coldata$day
dds$stage <- coldata$stage
dds$gender <- coldata$gender
dds$lineage <- coldata$lineage
dds$ambigous <- coldata$ambigous
colData(dds)$group <- paste0(dds$day, "_", dds$lineage, "_", dds$gender)
dds <- DESeqDataSetFromMatrix(countData = assay(dds[, rownames(colData(dds))]),
colData = colData(dds),
design = ~ gender + lineage + day)
dds$day <- factor(dds$day)
dds$gender <- factor(dds$gender, c(levels = c("M", "F")))
table(dds$day, dds$gender)
##
## M F
## E4 7 7
## E5 23 20
## E6 10 17
## E7 14 17
table(dds$lineage, dds$day, dds$gender)
## , , = M
##
##
## E4 E5 E6 E7
## epiblast 0 3 1 3
## not applicable 7 8 0 0
## primitive endoderm 0 2 2 4
## trophectoderm 0 10 7 7
##
## , , = F
##
##
## E4 E5 E6 E7
## epiblast 0 4 4 4
## not applicable 7 7 0 0
## primitive endoderm 0 2 4 3
## trophectoderm 0 7 9 10
dds <- estimateSizeFactors(dds)
filtering_thr <- 10
keep <- rowSums(counts(dds, normalize = TRUE) >= filtering_thr ) >= 5
table(keep)
## keep
## FALSE TRUE
## 5832 18583
dds <- DESeq(dds[keep,])
#dds <- DESeq(dds)
as_tibble(colData(dds)) %>%
ggplot(aes(x = ids, y = sizeFactor, fill = gender)) +
geom_col()+
theme(axis.text.x = element_text(angle = 90, size = 5, hjust = 1, vjust = .5)) +
facet_wrap(~ day, scales = "free_x", ncol = 4)
## PCA
rld <- varianceStabilizingTransformation(dds)
rv <- rowVars(assay(rld))
selected <- order(rv, decreasing = TRUE)[seq_len(min(500, length(rv)))]
pca <- prcomp(t(assay(rld)[selected, ]))
percentVar <- pca$sdev^2/sum(pca$sdev^2)
percentVar_1 <- paste0("PC1: ", round(percentVar[1] * 100), "% variance")
percentVar_2 <- paste0("PC2: ", round(percentVar[2] * 100), "% variance")
percentVar_3 <- paste0("PC3: ", round(percentVar[3] * 100), "% variance")
percentVar_4 <- paste0("PC4: ", round(percentVar[4] * 100), "% variance")
pcaData <- as_tibble(pca$x, rownames = "samples") %>%
dplyr::select(samples, "PC1", "PC2", "PC3", "PC4") %>%
left_join(as_tibble(colData(dds), rownames = "samples"))
ggplot(pcaData, aes(x = PC1, y = PC2, col = lineage, shape = gender)) +
geom_point(size = 2.5) +
xlab(percentVar_1) +
ylab(percentVar_2) +
facet_wrap(~ day)
ggplot(pcaData, aes(x = PC1, y = PC2, col = sizeFactor, shape = gender)) +
geom_point(size = 2.5) +
scale_color_continuous(type = "viridis") +
xlab(percentVar_1) +
ylab(percentVar_2) +
facet_wrap(~ day)
# ggplot(pcaData, aes(x = PC3, y = PC4, col = lineage, shape = gender)) +
# geom_point(size = 2.5) +
# xlab(percentVar_3) +
# ylab(percentVar_4) +
# facet_wrap(~ day)
=> Separation by day and by lineage
load("~/cluster/CBIO-templates/ensembl_2_geneName/transcripts_infos.rda")
transcripts_infos <- transcripts_infos %>% filter(chromosome_name %in% c(1:22, "X", "Y", "MT")) %>%
dplyr::select(external_gene_name, chromosome_name) %>% unique()
res_dds <- results(dds,
name = "gender_F_vs_M",
independentFiltering = FALSE)
res_tbl <- as_tibble(res_dds, rownames = "external_gene_name") %>%
left_join(transcripts_infos) %>%
arrange(padj)
# res_dds <- results(dds,
# name = "gender_female_vs_male",
# independentFiltering = TRUE)
# res_shr <- lfcShrink(dds, coef = "gender_F_vs_M")
# res_shr_tbl <- as_tibble(res_shr, rownames = "external_gene_name") %>%
# left_join(transcripts_infos) %>%
# arrange(padj)
#save(res_tbl, file = "../tmp_data/res_tbl_gender_F_vs_M.rda")
#save(res_shr_tbl, file = "../tmp_data/res_shr_tbl_gender_F_vs_M.rda")
padj_thr <- 0.05
log2FC_thr <- 1
ggplot(res_tbl %>% filter(!is.na(padj)), aes(x = pvalue)) +
geom_histogram(bins = 20, boundary = 0) +
ggtitle("pvalues histogram")
labeled_genes <- res_tbl %>%
filter(chromosome_name != "Y") %>%
filter(padj < padj_thr) %>%
filter(external_gene_name %in% CT_genes$external_gene_name) %>%
#filter(abs(log2FoldChange) > log2FC_thr ) %>%
arrange(padj) #%>% head(30)
res_tbl %>%
filter(!is.na(padj)) %>%
ggplot(aes(x= log2FoldChange, y = -log10(padj))) +
geom_point(size = 0.5) +
geom_point(data = res_tbl %>%
filter(chromosome_name == "X"),
aes(x= log2FoldChange, y = -log10(padj)),
color = "steelblue", size = 0.55) +
geom_point(data = res_tbl %>%
filter(external_gene_name %in% CT_genes$external_gene_name),
aes(x= log2FoldChange, y = -log10(padj)),
color = "red", size = 0.55) +
geom_point(data = res_tbl %>%
filter(chromosome_name == "Y"),
aes(x= log2FoldChange, y = -log10(padj)),
color = "black", size = 0.55) +
# xlab("log2FoldChange") +
ylab("-log10(padj)") +
ggrepel::geom_text_repel(data = res_tbl %>% filter(padj < 0.05 & log2FoldChange > 3),
aes(x = log2FoldChange, y = -log10(padj), label = external_gene_name),
size = 2,
segment.size = 0.1, max.overlaps = 30) +
geom_vline(xintercept = 0, linetype = "dashed", color = "gray70") +
geom_hline(yintercept = -log10(padj_thr), linetype = "dashed", color = "gray70")
load("~/cluster/Packages/CTdata/eh_data/CT_genes.rda")
CTP_genes <- CT_genes %>%
filter(CT_gene_type == "CTP_gene")
CT_genes <- CT_genes %>%
filter(CT_gene_type == "CT_gene")
thr <- 1e-35
logFC_thr <- 8
res_tbl_modified <- res_tbl %>%
mutate(group = case_when(padj < thr ~ "padj_outlier",
abs(log2FoldChange) >= logFC_thr ~ "logFC_outlier",
padj > thr & abs(log2FoldChange) < logFC_thr ~ "others")) %>%
mutate(padj = ifelse(padj <= thr, thr, padj)) %>%
mutate(log2FoldChange = ifelse(log2FoldChange >= logFC_thr, logFC_thr, log2FoldChange)) %>%
filter(chromosome_name %in% c(1:22, "X", "Y")) %>%
mutate(chr = case_when(chromosome_name %in% c("X", "Y") ~ chromosome_name,
!chromosome_name %in% c("X", "Y") ~ "Autosome")) %>%
mutate(X_CG_MethDEp = case_when(external_gene_name %in% CT_genes$external_gene_name ~ TRUE,
external_gene_name %in% CT_genes$external_gene_name ~ FALSE)) %>%
filter(!is.na(padj)) %>%
filter(!is.na(group))
#filter(abs(log2FoldChange) > 0.05)
labeled_X_genes <- res_tbl_modified %>%
filter(chr == "X") %>%
filter(abs(log2FoldChange) > 5 & padj < 0.00005) %>%
filter(!external_gene_name %in% CT_genes$external_gene_name) %>%
#filter(external_gene_name %in% CT_genes$external_gene_name) %>%
#filter(abs(log2FoldChange) > log2FC_thr ) %>%
arrange(padj)
labeled_CT_genes <- res_tbl_modified %>%
#filter(chr != "Y") %>%
filter(external_gene_name %in% CT_genes_X_met & padj < 0.0005 & log2FoldChange > 2.5) %>%
#filter(external_gene_name %in% CT_genes$external_gene_name) %>%
#filter(abs(log2FoldChange) > log2FC_thr ) %>%
arrange(padj)
X_sign_genes <- res_tbl_modified %>%
filter(chr == "X") %>%
filter(padj < padj_thr) %>%
arrange(padj) #%>% head(30)
Y_sign_genes <- res_tbl_modified %>%
filter(chr == "Y") %>%
filter(padj < padj_thr) %>%
filter(log2FoldChange < 0 ) %>%
arrange(padj) #%>% head(30)
set.seed = 123
fig <- res_tbl_modified %>%
ggplot(aes(x= log2FoldChange, y = -log10(padj))) +
geom_point(data = res_tbl_modified,
aes(x= log2FoldChange, y = -log10(padj), shape = group),
size = 1, color = "gray80") +
geom_point(data = res_tbl_modified %>%
filter(external_gene_name %in% X_sign_genes$external_gene_name),
aes(x= log2FoldChange, y = -log10(padj), shape = group),
size = 1, color = "steelblue") +
geom_point(data = res_tbl_modified %>%
filter(external_gene_name %in% Y_sign_genes$external_gene_name),
aes(x= log2FoldChange, y = -log10(padj), shape = group),
size = 1, color = "black") +
geom_point(data = res_tbl_modified %>%
filter(external_gene_name %in% CT_genes_X_met),# & padj < padj_thr),
aes(x= log2FoldChange, y = -log10(padj), shape = group),
size = 1.2, color = "firebrick2") +
scale_shape_manual(values = c(15, 19, 17))+
# c("padj_outlier" = 17,
# "others" = 15,
# "logFC_outlier" = 19))+
xlab("log2FoldChange") +
ylab("-log10(padj)") +
geom_vline(xintercept = 0, linetype = "dashed",
color = "black", linewidth = 0.2) +
# geom_vline(xintercept = -1, linetype = "dashed",
# color = "black", linewidth = 0.2) +
geom_hline(yintercept = -log10(padj_thr),
linetype = "dashed", color = "black", linewidth = 0.2) +
ggrepel::geom_text_repel(data = subset(res_tbl_modified, external_gene_name %in% labeled_X_genes$external_gene_name),
aes(x = log2FoldChange, y = -log10(padj), label = external_gene_name),
size = 3, color = "steelblue",
segment.size = 0.1, max.overlaps = 10) +
ggrepel::geom_text_repel(data = subset(res_tbl_modified, external_gene_name %in% labeled_CT_genes$external_gene_name),
aes(x = log2FoldChange, y = -log10(padj), label = external_gene_name),
size = 3, color = "firebrick2",
segment.size = 0.1, max.overlaps = 30) +
xlim(-logFC_thr,logFC_thr) +
ggtitle("Petropoulos dataset") +
theme_bw() +
theme(legend.position = "none",
title = element_text(size = 10),
panel.grid = element_blank())
fig
pdf(file = "../figs/FigS2_Volcano_Petropoulos_dataset.pdf", width = 4, height = 4)
fig
dev.off()
## png
## 2
Genes with a padjusted value < 10^{-35} are represented as triangles
Genes with an absolute log2FC > 8 are represented as squares
Red dots = X-linked MethDep genes
Blue dots = X-linked genes not CG genes
Black dots = Y-linked genes
Gray dots = other genes
# pdf(file = paste0("../figs/figs_CT_in_embryo/Fig_Female_vs_Male_volcano_Zhu.pdf"),
# width = 5, height = 4)
# fig
# dev.off()
# #idem but shrinked logFC:
#
# res_shr_tbl %>%
# filter(!is.na(padj)) %>%
# ggplot(aes(x= log2FoldChange, y = -log10(padj))) +
# geom_point(size = 0.5) +
# geom_point(size = 0.5) +
# geom_point(data = res_shr_tbl %>%
# filter(chromosome_name == "X"),
# aes(x= log2FoldChange, y = -log10(padj)),
# color = "steelblue", size = 0.55) +
# geom_point(data = res_shr_tbl %>%
# filter(external_gene_name %in% CT_genes$external_gene_name),
# aes(x= log2FoldChange, y = -log10(padj)),
# color = "red", size = 0.55) +
#
# geom_point(data = res_shr_tbl %>%
# filter(chromosome_name == "Y"),
# aes(x= log2FoldChange, y = -log10(padj)),
# color = "black", size = 0.55) +
# # xlab("log2FoldChange") +
# ylab("-log10(padj)") +
# ggrepel::geom_text_repel(data = subset(res_shr_tbl, external_gene_name %in% CT_genes$external_gene_name),
# aes(x = log2FoldChange, y = -log10(padj), label = external_gene_name),
# size = 2,
# segment.size = 0.1, max.overlaps = 30) +
# geom_vline(xintercept = 0, linetype = "dashed", color = "gray70") +
# geom_hline(yintercept = -log10(padj_thr), linetype = "dashed", color = "gray70")
ordered_samples <- as_tibble(colData(dds)) %>%
arrange(gender, day) %>%
pull(ids)
plot_counts <- function(my_gene, order = ordered_samples){
as_tibble(counts(dds[my_gene,], normalize = T), rownames = 'gene') %>%
gather(ids, counts, -gene) %>%
left_join(as_tibble(colData(dds))) %>%
mutate(sample = factor(ids, levels = order)) %>%
# mutate(gene = mapIds(org.Hs.eg.db, ENSEMBL, "SYMBOL", "ENSEMBL")) %>%
# mutate(Gene = case_when(is.na(gene) ~ ENSEMBL,
# !is.na(gene) ~ gene)) %>%
ggplot(aes(x = sample, y = counts, fill = gender)) +
geom_bar(stat = 'identity')+
facet_wrap( ~ gene, scales = "free") +
theme(axis.text.x = element_text(size = 6, angle = 90, vjust = 0.5, hjust = 1)) +
xlab(element_blank()) +
ylab("Normalized counts")
}
plot_counts("VCX2")
plot_counts(c("MAGEC2"))
# plot_counts(res_tbl %>% filter(padj < 0.05) %>% arrange(desc(log2FoldChange)) %>% pull(external_gene_name) %>% head(16))
# plot_counts("MAGEC2")
# plot_counts("XIST")
# plot_counts("POU4F1")
plot_counts_by_day <- function(my_gene, order = ordered_samples){
as_tibble(counts(dds[my_gene,], normalize = T), rownames = 'gene') %>%
gather(ids, counts, -gene) %>%
left_join(as_tibble(colData(dds))) %>%
mutate(sample = factor(ids, levels = order)) %>%
# mutate(gene = mapIds(org.Hs.eg.db, ENSEMBL, "SYMBOL", "ENSEMBL")) %>%
# mutate(Gene = case_when(is.na(gene) ~ ENSEMBL,
# !is.na(gene) ~ gene)) %>%
ggplot(aes(x = sample, y = counts, fill = gender)) +
#scale_fill_manual(values = list("M" = "steelblue", "F" = "deeppink"))+
geom_bar(stat = 'identity')+
facet_wrap( ~ day, scales = "free") +
theme(axis.text.x = element_text(size = 6, angle = 90, vjust = 0.5, hjust = 1)) +
xlab(element_blank()) +
ylab("Normalized counts")
}
plot_counts_by_day("MAGEC2")
# pdf(file = paste0("../figs/figs_CT_in_embryo/Fig_Female_vs_Male_volcano_petropoulos_E4_E7.pdf"),
# width = 5, height = 4)
# fig
# dev.off()
Expression of all X-linked_methDep significantly DE between females and males
my_genes <- labeled_CT_genes %>%
filter(log2FoldChange > 2)
plot_counts <- function(my_gene, order = ordered_samples){
as_tibble(counts(dds[my_gene,], normalize = T), rownames = 'gene') %>%
gather(ids, counts, -gene) %>%
left_join(as_tibble(colData(dds))) %>%
mutate(sample = factor(ids, levels = order)) %>%
# mutate(gene = mapIds(org.Hs.eg.db, ENSEMBL, "SYMBOL", "ENSEMBL")) %>%
# mutate(Gene = case_when(is.na(gene) ~ ENSEMBL,
# !is.na(gene) ~ gene)) %>%
ggplot(aes(x = sample, y = counts, fill = gender)) +
geom_bar(stat = 'identity')+
facet_wrap( ~ gene, scales = "free") +
theme(axis.text.x = element_text(size = 0, angle = 90, vjust = 0.5, hjust = 1)) +
xlab(element_blank()) +
ylab("Normalized counts")
}
plot_counts(my_genes$external_gene_name)
legends_param <- list(
labels_gp = gpar(col = "black", fontsize = 8),
title_gp = gpar(col = "black", fontsize = 8, fontface = "bold"),
simple_anno_size = unit(0.1, "cm"),
row_names_gp = gpar(fontsize = 1),
annotation_name_side = "left",
border = FALSE,
border_gp = gpar(lwd = 0.1),
grid_width = unit(0.1, "cm"),
grid_height = unit(0.1, "cm"),
legend_height = unit(0.1, "cm"))
legend_colors <- c(
"#5E4FA2", "#3288BD", "#66C2A5", "#ABDDA4", "#E6F598",
"#FFFFBF", "#FEE08B", "#FDAE61", "#F46D43", "#D53E4F",
"#9E0142")
ensembl_gene <- data.frame(gene = all_genes$external_gene_name)
rownames(ensembl_gene) <- all_genes$ensembl_gene_id
draw_embryo_exp_Petropoulos_dataset <- function(genes,
font_size = 6,
scale = FALSE,
log_transform = TRUE,
scale_lims = NULL,
clustRow = TRUE,
h_width = 8,
h_height = 8,
show_right_annot = TRUE,
clust_method = "centroid"){
# Top Annotations
df_col <- colData(dds)
df_col$sex <- df_col$gender
levels(df_col$sex) <- c("Male", "Female")
df_col <- df_col[order(df_col$sex, df_col$sex),]
df_col$day_sex <- paste0(df_col$day, '_', df_col$gender)
column_ha_sex = HeatmapAnnotation(
sex = df_col$sex,
col = list(sex = c("Female" = "deeppink", "Male" = "steelblue")),
border = FALSE,
simple_anno_size = unit(0.4, "cm"),
annotation_legend_param = legends_param,
annotation_name_gp = gpar(fontsize = 0))
column_ha_day = HeatmapAnnotation(
day = df_col$day,
col = list(day = c("E4" = "aliceblue", "E4" = "moccasin", "E5" = "orange3", "E6" = "firebrick3", "E7" = "brown4")),
# border = FALSE,
simple_anno_size = unit(0.4, "cm"),
annotation_legend_param = legends_param,
annotation_name_gp = gpar(fontsize = 0))
genes <- genes[genes %in% rownames(dds)]
mat <- counts(dds[genes, ], normalize = TRUE)
#rownames(mat) <- ensembl_gene[genes, ]
units = "Counts"
if (log_transform) {
mat <- log1p(mat)
units = "log1p(Counts)"
}
if (scale) {
mat <- t(scale(t(mat), center = FALSE, scale = TRUE))
units = "scaled_exp"
}
# gene_type <- CT_genes_types %>%
# filter(gene %in% genes) %>%
# mutate(gene = factor(gene, levels = genes)) %>%
# arrange(gene)
if (is.null(scale_lims)) scale_lims <- c(0, max(mat))
heatmap_colors <- colorRamp2(
seq(scale_lims[1], scale_lims[2], length = 11), legend_colors)
ht1 <- Heatmap(mat[, rownames(df_col)],
name = units,
clustering_method_rows = "ward.D",
clustering_method_columns = "ward.D",
column_split = df_col$day,
row_gap = unit(c(1), "mm"),
col = heatmap_colors,
cluster_rows = clustRow,
cluster_columns = FALSE,
cluster_row_slices = FALSE,
cluster_column_slices = FALSE,
show_row_names = TRUE,
show_column_names = FALSE,
show_row_dend = FALSE,
column_names_gp = gpar(fontsize = 8),
column_names_centered = TRUE,
row_names_gp = gpar(fontsize = font_size),
row_names_side = "left",
border = TRUE,
border_gp = gpar(lwd = 0.5),
column_names_side = c("bottom"),
column_names_rot = 90,
column_title_gp = gpar(fontsize = 12),
row_title_gp = gpar(fontsize = 0),
top_annotation = c(column_ha_sex,
#column_ha_day,
gap = unit(0.1, "mm")),
use_raster = TRUE,
raster_device = "CairoPNG",
raster_quality = 10,
heatmap_legend_param = legends_param,
width = unit(h_width, "cm"),
height = unit(h_height, "cm"))
draw(ht1, merge_legend = TRUE)
}
Expression Heatmap of all X-linked_methDep
draw_embryo_exp_Petropoulos_dataset(CT_genes_X_met, clustRow = T, log_transform = FALSE, scale_lims = c(0,200))
# #idem but log_transformed:
# draw_embryo_exp_Petropoulos_dataset(CT_genes_X_met, clustRow = T, log_transform = TRUE)
Expression Heatmap of all X-linked_methDep significantly DE between females and males
and with logFC > 3
my_genes <- labeled_CT_genes %>%
filter(log2FoldChange > 3)
draw_embryo_exp_Petropoulos_dataset(my_genes$external_gene_name,
clustRow = T,
h_width = 8,
h_height = 4,
log_transform = F,
scale_lims = c(0, 200))
as_tibble(counts(dds[my_genes$external_gene_name,]), rownames = "gene") %>%
pivot_longer(names_to = "embryo", values_to = "counts", -gene) %>%
left_join(as_tibble(colData(dds), rownames = "embryo")) %>%
ggplot(aes(x = group, y = log1p(counts), color = gender)) +
scale_color_manual(values = c("M" = "blue", "F" = "deeppink")) +
#geom_jitter(size = 1) +
geom_boxplot(alpha = 0, outliers = FALSE) +
facet_wrap(~ gene, scales = "free_y") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90))
as_tibble(counts(dds["MAGEC2",]), rownames = "gene") %>%
pivot_longer(names_to = "embryo", values_to = "counts", -gene) %>%
left_join(as_tibble(colData(dds), rownames = "embryo")) %>%
ggplot(aes(x = group, y = log1p(counts), color = gender)) +
scale_color_manual(values = c("M" = "blue", "F" = "deeppink")) +
#geom_jitter(size = 1) +
geom_boxplot(alpha = 0, outliers = FALSE) +
facet_wrap(~ gene, scales = "free_y") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, size = 5))
sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so; LAPACK version 3.10.0
##
## locale:
## [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
## [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
## [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/Brussels
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 grid stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] scran_1.34.0 scater_1.34.1
## [3] scuttle_1.16.0 DropletUtils_1.26.0
## [5] SingleCellExperiment_1.28.1 DESeq2_1.46.0
## [7] SummarizedExperiment_1.36.0 Biobase_2.66.0
## [9] MatrixGenerics_1.18.1 matrixStats_1.5.0
## [11] GenomicRanges_1.58.0 GenomeInfoDb_1.42.3
## [13] IRanges_2.40.1 S4Vectors_0.44.0
## [15] BiocGenerics_0.52.0 circlize_0.4.16
## [17] CTexploreR_1.2.0 CTdata_1.6.0
## [19] ComplexHeatmap_2.22.0 lubridate_1.9.4
## [21] forcats_1.0.0 stringr_1.5.1
## [23] purrr_1.0.4 readr_2.1.5
## [25] tidyr_1.3.1 tibble_3.2.1
## [27] ggplot2_3.5.1 tidyverse_2.0.0
## [29] dplyr_1.1.4
##
## loaded via a namespace (and not attached):
## [1] RColorBrewer_1.1-3 rstudioapi_0.17.1
## [3] jsonlite_1.9.1 shape_1.4.6.1
## [5] magrittr_2.0.3 magick_2.8.6
## [7] ggbeeswarm_0.7.2 farver_2.1.2
## [9] rmarkdown_2.29 GlobalOptions_0.1.2
## [11] zlibbioc_1.52.0 vctrs_0.6.5
## [13] Cairo_1.6-2 memoise_2.0.1
## [15] DelayedMatrixStats_1.28.1 htmltools_0.5.8.1
## [17] S4Arrays_1.6.0 AnnotationHub_3.14.0
## [19] curl_6.2.2 BiocNeighbors_2.0.1
## [21] Rhdf5lib_1.28.0 SparseArray_1.6.2
## [23] rhdf5_2.50.2 sass_0.4.9
## [25] bslib_0.9.0 cachem_1.1.0
## [27] igraph_2.1.4 lifecycle_1.0.4
## [29] iterators_1.0.14 pkgconfig_2.0.3
## [31] rsvd_1.0.5 Matrix_1.7-3
## [33] R6_2.6.1 fastmap_1.2.0
## [35] GenomeInfoDbData_1.2.13 clue_0.3-66
## [37] digest_0.6.37 colorspace_2.1-1
## [39] AnnotationDbi_1.68.0 dqrng_0.4.1
## [41] irlba_2.3.5.1 ExperimentHub_2.14.0
## [43] RSQLite_2.3.9 beachmat_2.22.0
## [45] labeling_0.4.3 filelock_1.0.3
## [47] timechange_0.3.0 httr_1.4.7
## [49] abind_1.4-8 compiler_4.4.1
## [51] bit64_4.6.0-1 withr_3.0.2
## [53] doParallel_1.0.17 BiocParallel_1.40.0
## [55] viridis_0.6.5 DBI_1.2.3
## [57] HDF5Array_1.34.0 R.utils_2.13.0
## [59] rappdirs_0.3.3 DelayedArray_0.32.0
## [61] bluster_1.16.0 rjson_0.2.23
## [63] tools_4.4.1 vipor_0.4.7
## [65] beeswarm_0.4.0 R.oo_1.27.0
## [67] glue_1.8.0 rhdf5filters_1.18.1
## [69] cluster_2.1.8.1 generics_0.1.3
## [71] gtable_0.3.6 tzdb_0.5.0
## [73] R.methodsS3_1.8.2 hms_1.1.3
## [75] metapod_1.14.0 BiocSingular_1.22.0
## [77] ScaledMatrix_1.14.0 XVector_0.46.0
## [79] ggrepel_0.9.6 BiocVersion_3.20.0
## [81] foreach_1.5.2 pillar_1.10.1
## [83] limma_3.62.2 BiocFileCache_2.14.0
## [85] lattice_0.22-6 bit_4.6.0
## [87] tidyselect_1.2.1 locfit_1.5-9.12
## [89] Biostrings_2.74.1 knitr_1.50
## [91] gridExtra_2.3 edgeR_4.4.2
## [93] xfun_0.51 statmod_1.5.0
## [95] stringi_1.8.4 UCSC.utils_1.2.0
## [97] yaml_2.3.10 evaluate_1.0.3
## [99] codetools_0.2-20 BiocManager_1.30.25
## [101] cli_3.6.4 munsell_0.5.1
## [103] jquerylib_0.1.4 Rcpp_1.0.14
## [105] dbplyr_2.5.0 png_0.1-8
## [107] parallel_4.4.1 blob_1.2.4
## [109] sparseMatrixStats_1.18.0 viridisLite_0.4.2
## [111] scales_1.3.0 crayon_1.5.3
## [113] GetoptLong_1.0.5 rlang_1.1.5
## [115] cowplot_1.1.3 KEGGREST_1.46.0